snATAC-seq - Final visualisation and clustering

Blanca Pijuan-Sala

18 April 2019

In [2]:
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as pl
import scanpy.api as sc
import pandas as pd
import matplotlib.pyplot as plt
import episcanpy.api as epi
import anndata as ad
import numpy as np

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=100, color_map='viridis')  # low dpi (dots per inch) yields small inline figures
sc.logging.print_versions()
#Set working directory
wd = '/path/to/file/sample_pooled_preprocess_revision1/'
direc = wd
sc.settings.figdir = './plots/'
results_file="./write/20190416_snATACseq_embryo_revision01_doublets_cisTopic_50_100_filtered_final.h5ad"
/home/USSR/bp382/bin/python/anaconda3/lib/python3.6/site-packages/scanpy/api/__init__.py:6: FutureWarning: 

In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.

  FutureWarning
scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.7 numpy==1.15.0 scipy==1.3.0 pandas==0.24.0 scikit-learn==0.20.2 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1

Read files

In [3]:
##====== Read counts ======##

filename_data = direc + '11_matrix_afterClusterQC/embryo_revision1_allPeaks_afterClusterPeak.mat.bin.mtx'
filename_gene_names = direc + '11_matrix_afterClusterQC/embryo_revision1_allPeaks_passedQC_peakNames.txt'
filename_barcodes = direc + '11_matrix_afterClusterQC/embryo_revision1_allPeaks_passedQC_barcodeNames.xgi'
#filename_clustersAll = wd + 'data/cellTypes_20180215.txt'
#adata_all_cells = epi.tl.read_ATAC(filename_data, filename_barcodes, filename_gene_names,path_file='')

print('reading counts')
adata_all_cells = sc.read(filename_data, cache=True).transpose()
adata_all_cells.X = adata_all_cells.X.astype(np.int64)
print('reading genes')
adata_all_cells.var_names = np.genfromtxt(filename_gene_names, dtype='str')
print('reading cells')
adata_all_cells.obs_names = np.genfromtxt(filename_barcodes, dtype='str')
reading counts
WARNING: Your filename has more than two extensions: ['.mat', '.bin', '.mtx'].
Only considering the two last: ['.bin', '.mtx'].
WARNING: Your filename has more than two extensions: ['.mat', '.bin', '.mtx'].
Only considering the two last: ['.bin', '.mtx'].
... reading from cache file cache/home-USSR-codex-pipeline-Data-Rebecca-Blanca-sample_pooled_preprocess_revision1-11_matrix_afterClusterQC-embryo_revision1_allPeaks_afterClusterPeak.mat.bin.h5ad
reading genes
reading cells
In [4]:
adata_all_cells.shape
Out[4]:
(23838, 305187)

colour Palettes

In [5]:
#adata = sc.read(results_file)
In [6]:
##====== Create colour palette for gene expression profiles ======##

from matplotlib.colors import LinearSegmentedColormap
rmap = LinearSegmentedColormap.from_list(name='gene_cmap',
                                         colors=['lightgrey', 'thistle', 'red', 'darkred'])

cmap = LinearSegmentedColormap.from_list(name='gene_cmap',
                                         colors=["#BFBFBF","#6495ED","#000000"])

cisTopic with reads_in_peaks > 0.24

In [7]:
cells = np.genfromtxt(direc + "12_barcodeStats_celltypePeaks/embryo_revision1_readsPeaks24.xgi", dtype='str')
In [8]:
adata = adata_all_cells[np.array([str(i) in cells for i in adata_all_cells.obs_names]),:]
In [9]:
adata.shape
Out[9]:
(19453, 305187)
In [10]:
from scipy.io import mmread

cistopic = mmread(direc + '13_cisTopic/data/cisTopic_matrix_50_100_afterReadsInPeaksQC_24.mtx')
In [11]:
cistopic = cistopic.todense()
cistopic= cistopic.transpose()
cistopic.shape
Out[11]:
(19453, 100)
In [12]:
adata.obsm['X_pca'] = cistopic
In [13]:
adata.obs=pd.DataFrame(cistopic)
AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 

UMAP visualisation and clustering

In [14]:
sc.pp.neighbors(adata)
computing neighbors
    using 'X_pca' with n_pcs = 100
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:53)
In [15]:
sc.tl.umap(adata,random_state=4)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:40)
In [16]:
sc.tl.louvain(adata,random_state=7)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 18 clusters and added
    'louvain', the cluster labels (adata.obs, categorical) (0:00:07)
In [52]:
sc.pl.umap(adata, color=[ 'louvain'])

Plot cistopic scores

In [53]:
for i in adata.obs:
    sc.pl.umap(adata, color=[i])

Write files

In [111]:
#save UMAP
UMAP = pd.DataFrame(data=adata.obsm['X_umap'])
UMAP.to_csv(direc + '14_visualisation/data/snATACseq_embryo_revision1_reads_in_peaks_above_24_UMAP.csv', sep=',')

#save clusters

louvain = pd.DataFrame(data=adata.obs['louvain'])
louvain.to_csv(direc + '14_visualisation/data/snATACseq_embryo_revision1_reads_in_peaks_above_24_Louvain.csv', sep=',')

Count number of cells with regions accessible per cell type

In [65]:
mat = np.zeros((len(adata.var_names), len(np.unique(adata.obs['louvain']))))
count=0
for i in np.unique(adata.obs['louvain']):
    print(i)
    adata_endo = adata[[j == i for j  in adata.obs["louvain"]],:]
    numEndo = np.sum(adata_endo.X,axis=0).tolist()
    print(numEndo[0][0:5])
    mat[:,count] = numEndo[0]
    count+=1
    print(count)
0
[7.0, 8.0, 6.0, 12.0, 27.0]
1
1
[26.0, 11.0, 10.0, 9.0, 8.0]
2
10
[2.0, 4.0, 4.0, 0.0, 6.0]
3
11
[5.0, 4.0, 4.0, 4.0, 35.0]
4
12
[1.0, 1.0, 2.0, 1.0, 2.0]
5
13
[5.0, 2.0, 2.0, 2.0, 3.0]
6
14
[2.0, 9.0, 7.0, 0.0, 5.0]
7
15
[1.0, 1.0, 2.0, 3.0, 14.0]
8
16
[2.0, 0.0, 0.0, 1.0, 0.0]
9
17
[1.0, 0.0, 1.0, 1.0, 0.0]
10
2
[2.0, 2.0, 2.0, 0.0, 3.0]
11
3
[6.0, 5.0, 5.0, 6.0, 48.0]
12
4
[16.0, 9.0, 9.0, 12.0, 14.0]
13
5
[7.0, 9.0, 8.0, 29.0, 18.0]
14
6
[9.0, 5.0, 7.0, 7.0, 11.0]
15
7
[0.0, 7.0, 7.0, 4.0, 9.0]
16
8
[5.0, 7.0, 2.0, 4.0, 37.0]
17
9
[6.0, 0.0, 0.0, 0.0, 2.0]
18
In [66]:
MAT = pd.DataFrame(data=mat)
MAT
Out[66]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
0 7.0 26.0 2.0 5.0 1.0 5.0 2.0 1.0 2.0 1.0 2.0 6.0 16.0 7.0 9.0 0.0 5.0 6.0
1 8.0 11.0 4.0 4.0 1.0 2.0 9.0 1.0 0.0 0.0 2.0 5.0 9.0 9.0 5.0 7.0 7.0 0.0
2 6.0 10.0 4.0 4.0 2.0 2.0 7.0 2.0 0.0 1.0 2.0 5.0 9.0 8.0 7.0 7.0 2.0 0.0
3 12.0 9.0 0.0 4.0 1.0 2.0 0.0 3.0 1.0 1.0 0.0 6.0 12.0 29.0 7.0 4.0 4.0 0.0
4 27.0 8.0 6.0 35.0 2.0 3.0 5.0 14.0 0.0 0.0 3.0 48.0 14.0 18.0 11.0 9.0 37.0 2.0
5 5.0 7.0 3.0 4.0 3.0 1.0 3.0 1.0 4.0 0.0 2.0 1.0 3.0 3.0 9.0 4.0 2.0 4.0
6 5.0 5.0 2.0 1.0 2.0 2.0 3.0 0.0 0.0 0.0 1.0 3.0 2.0 5.0 5.0 0.0 2.0 1.0
7 46.0 107.0 36.0 8.0 50.0 24.0 32.0 1.0 0.0 2.0 2.0 17.0 69.0 23.0 14.0 37.0 13.0 8.0
8 2.0 4.0 4.0 0.0 0.0 2.0 4.0 2.0 0.0 0.0 1.0 8.0 7.0 5.0 5.0 5.0 1.0 4.0
9 9.0 6.0 30.0 4.0 13.0 4.0 9.0 2.0 0.0 0.0 2.0 6.0 8.0 7.0 3.0 7.0 4.0 3.0
10 10.0 12.0 2.0 5.0 2.0 2.0 3.0 1.0 0.0 1.0 1.0 6.0 8.0 10.0 4.0 0.0 5.0 4.0
11 22.0 5.0 1.0 9.0 0.0 1.0 1.0 9.0 0.0 0.0 2.0 22.0 4.0 29.0 4.0 6.0 12.0 0.0
12 7.0 12.0 3.0 5.0 1.0 1.0 3.0 0.0 0.0 2.0 2.0 8.0 1.0 9.0 13.0 21.0 1.0 7.0
13 3.0 8.0 2.0 2.0 0.0 0.0 2.0 4.0 0.0 2.0 0.0 4.0 1.0 4.0 5.0 2.0 5.0 2.0
14 4.0 5.0 0.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 4.0 2.0 7.0 7.0 4.0 4.0 1.0
15 17.0 22.0 10.0 9.0 0.0 3.0 4.0 5.0 0.0 0.0 2.0 18.0 21.0 35.0 9.0 12.0 12.0 2.0
16 11.0 13.0 9.0 3.0 10.0 3.0 6.0 1.0 0.0 0.0 2.0 6.0 12.0 9.0 10.0 9.0 5.0 2.0
17 10.0 3.0 1.0 12.0 3.0 1.0 0.0 7.0 0.0 0.0 1.0 46.0 6.0 12.0 3.0 4.0 22.0 4.0
18 7.0 1.0 5.0 4.0 0.0 3.0 3.0 1.0 1.0 0.0 1.0 10.0 2.0 5.0 2.0 3.0 5.0 1.0
19 6.0 9.0 4.0 8.0 2.0 2.0 4.0 5.0 0.0 1.0 4.0 8.0 14.0 7.0 8.0 6.0 5.0 6.0
20 9.0 15.0 12.0 6.0 14.0 3.0 10.0 0.0 0.0 1.0 1.0 9.0 9.0 9.0 12.0 6.0 7.0 4.0
21 13.0 7.0 3.0 10.0 2.0 3.0 2.0 3.0 0.0 2.0 1.0 10.0 7.0 15.0 8.0 3.0 14.0 2.0
22 39.0 12.0 4.0 32.0 4.0 3.0 2.0 7.0 1.0 3.0 2.0 49.0 15.0 44.0 9.0 7.0 17.0 3.0
23 18.0 15.0 5.0 1.0 1.0 3.0 3.0 3.0 1.0 4.0 2.0 3.0 23.0 8.0 12.0 6.0 9.0 2.0
24 16.0 14.0 5.0 10.0 1.0 4.0 2.0 1.0 1.0 0.0 2.0 4.0 23.0 3.0 8.0 1.0 6.0 0.0
25 8.0 5.0 0.0 3.0 1.0 3.0 0.0 3.0 5.0 0.0 1.0 6.0 10.0 5.0 7.0 3.0 8.0 3.0
26 6.0 6.0 5.0 2.0 2.0 2.0 1.0 2.0 0.0 0.0 1.0 7.0 7.0 6.0 4.0 1.0 3.0 1.0
27 9.0 7.0 10.0 16.0 2.0 1.0 3.0 2.0 0.0 1.0 3.0 8.0 10.0 7.0 14.0 5.0 21.0 1.0
28 18.0 7.0 6.0 18.0 6.0 3.0 1.0 1.0 0.0 2.0 1.0 17.0 15.0 14.0 8.0 7.0 30.0 1.0
29 11.0 5.0 4.0 13.0 2.0 3.0 3.0 1.0 1.0 0.0 1.0 7.0 8.0 10.0 5.0 5.0 11.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
305157 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305158 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305159 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305160 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305161 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305162 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
305163 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305164 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305165 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
305166 633.0 591.0 272.0 281.0 218.0 231.0 165.0 157.0 81.0 13.0 545.0 646.0 503.0 537.0 408.0 389.0 305.0 216.0
305167 455.0 398.0 161.0 202.0 142.0 154.0 91.0 95.0 80.0 14.0 335.0 468.0 339.0 405.0 268.0 194.0 224.0 135.0
305168 235.0 210.0 82.0 104.0 106.0 98.0 48.0 71.0 65.0 10.0 245.0 280.0 171.0 216.0 132.0 124.0 116.0 80.0
305169 262.0 235.0 92.0 132.0 107.0 110.0 56.0 69.0 63.0 10.0 248.0 304.0 200.0 228.0 154.0 138.0 120.0 82.0
305170 431.0 390.0 139.0 185.0 158.0 163.0 110.0 103.0 84.0 17.0 411.0 470.0 295.0 362.0 259.0 268.0 212.0 136.0
305171 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
305172 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305173 78.0 70.0 31.0 30.0 28.0 29.0 24.0 20.0 16.0 4.0 75.0 93.0 73.0 67.0 56.0 50.0 57.0 36.0
305174 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305175 45.0 46.0 18.0 21.0 24.0 15.0 14.0 11.0 3.0 2.0 24.0 64.0 45.0 49.0 39.0 23.0 26.0 24.0
305176 203.0 193.0 90.0 90.0 56.0 70.0 53.0 47.0 24.0 6.0 169.0 195.0 164.0 155.0 145.0 136.0 102.0 69.0
305177 148.0 161.0 54.0 71.0 41.0 68.0 38.0 41.0 23.0 5.0 91.0 177.0 135.0 118.0 102.0 76.0 76.0 36.0
305178 86.0 89.0 31.0 29.0 18.0 39.0 18.0 25.0 5.0 7.0 48.0 101.0 74.0 61.0 52.0 35.0 42.0 22.0
305179 222.0 240.0 70.0 102.0 56.0 73.0 50.0 58.0 24.0 7.0 176.0 247.0 186.0 177.0 121.0 118.0 103.0 64.0
305180 151.0 144.0 55.0 69.0 34.0 65.0 27.0 31.0 16.0 9.0 92.0 185.0 119.0 130.0 95.0 78.0 73.0 43.0
305181 126.0 111.0 40.0 40.0 21.0 49.0 25.0 27.0 8.0 9.0 60.0 133.0 103.0 97.0 64.0 41.0 49.0 34.0
305182 295.0 257.0 96.0 114.0 60.0 105.0 60.0 90.0 26.0 12.0 168.0 336.0 236.0 207.0 171.0 119.0 145.0 67.0
305183 202.0 206.0 65.0 80.0 41.0 69.0 26.0 58.0 15.0 6.0 106.0 267.0 179.0 167.0 113.0 81.0 98.0 44.0
305184 192.0 199.0 69.0 98.0 65.0 77.0 70.0 61.0 25.0 15.0 190.0 235.0 151.0 149.0 138.0 87.0 104.0 71.0
305185 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
305186 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

305187 rows × 18 columns

In [115]:
MAT = pd.DataFrame(data=mat)
MAT.to_csv(direc + '14_visualisation/data/snATACseq_embryo_revision01_pooledNumber_OCR_perCelltype.csv', sep=',')
#save clusters
#save peaks used
CELL = pd.DataFrame(data=np.unique(adata.obs['louvain']))
CELL.to_csv(direc + '14_visualisation/data/snATACseq_embryo_revision01_pooledNumber_OCR_perCelltype_celltypeLabel.csv', sep=',')

PEAK = pd.DataFrame(data=adata.var_names)

PEAK.to_csv(direc + '14_visualisation/data/snATACseq_embryo_revision01_pooledNumber_OCR_perCelltype_peakLabel.csv', sep=',')
In [116]:
MAT.shape
Out[116]:
(305187, 18)